/* 

Name : upload_data.do 
Authors: Martin, Méjean, Parenti
Last update: Sept. 15, 2023 
Aim : Upload the raw customs data and run the cleansing procedure
Note: Extremely time consuming so avoid re-running

*/

do $dopath/id_group

* 1. Clean raw data
use ./firm/x_all_new_ss.dta, clear
rename pyod iso2
rename euro export

*1.1 Keep Valid siren numbers
tostring siren, replace
replace siren="0"+siren if length(siren)==8 
replace siren="00"+siren if length(siren)==7

drop if length(siren)<=5 
drop if siren=="000000000"|siren=="111111111"|siren=="222222222" |siren=="202020202"|siren=="777777777"|siren=="888888888"|siren=="999999999"

*1.2 Keep valid pyod
* Before 1999: XU for BEL and LUX - but info on the destination country with iso2_tva in 98% of the cases 
* below a threshold, iso2 is not reported ("00") but iso2_tva can be
* in VAT number, Greece is refered to as EL
gen iso2_tva=substr(numtvaanonym,1,2)
replace iso2=iso2_tva if iso2=="XU" & (iso2_tva=="BE"|iso2_tva=="LU")
replace iso2="BE" if iso2=="XU"
replace iso2_tva="GR" if iso2_tva=="EL"

*For observations under_threshold : No pyod ==> attribute iso_tva
gen missing_iso2=missing(iso2)
replace iso2=iso2_tva if missing_iso2==1
*By doing that some invalid iso2 appears ==> dropped
drop if inlist(iso2,"FR","GL","PA","XX") // (435 observations deleted)

*1.3 Keep relevant regim
*Regime 22 : "Transfert d'un bien taxable dans l'Etat d'arrivée" has been absorbed by regim 21 after 2001
*Regime 23 "Facturation d'un travail à façon" and regime 24 "Envoi de matériaux en vue de travail à façon" ==> has been absorbed by regime 29 after 1997
*Drop flux 25 and 26 as they do not refer to a particular transaction (outside of trade statistics)
replace regdem=21 if regdem==22 
replace regdem=29 if regdem==23 |regdem==24
drop if regdem==25 | regdem==26 // not included usually, no buyer, no nc8


*1.4 : remove flows with export=0 and kgs=0
count if export==0 & kgs==0
drop if export==0 & kgs==0


*1.5 Clean CN8 products
* We want to keep only valid NC8 codes
* NC8 99500000 is a code used for small transactions and "envois groupés" 
* NC8 9880XX00 - XX being 73, 84, 99 or 85 - is a code used for large manufacturing or commercial firms which import values above 3 million euros and benefit from a simplified code grouping all products together
* L’arrêté du 26 décembre 2000 (modifié) fixe les conditions d’utilisation de la nomenclature spécifique des produits 
* 99050000 : personnal belongings of moving individuals
* 99190000 : belongings of catastrophy victims (coffin...) 
* 9930 : Refuelling
* 9931 : goods for offsghores installations/infrastructures
* Seek for more informations in the note. 
* In flux 13, non-valid cn8 account for about 0.5 percent of transactions, 97.31 of which are "99500000". 97.49% of non-valid cn8 are in flux 3 : no 99500000
tostring nc8, replace
replace nc8="0"+nc8 if length(nc8)==7
replace nc8="00"+nc8 if length(nc8)==6
replace nc8="000"+nc8 if length(nc8)==5
replace nc8="0000"+nc8 if length(nc8)==4
replace nc8="00000"+nc8 if length(nc8)==3
replace nc8="000000"+nc8 if length(nc8)==2
replace nc8="0000000"+nc8 if length(nc8)==1
g nc8group=(nc8=="99500000" | nc8=="98807300" | nc8=="98808400" | nc8=="98809900" | nc8=="98808500" )
g nc8part = (nc8=="99699999" |nc8=="99050000" |nc8=="99190000" |nc8=="99300000"|nc8=="99310000")
drop if nc8part == 1
drop if nc8group==1
drop nc8group nc8part 

* 1.6 : Clean buyer number
encode(control), gen(encode2)
label define control 1 "Not found buyer's VAT number"  2 "Buyer's VAT number recoded but Not found"  3 "Recoded and verified buyer VAT number" 4 "Verified initial buyer VAT number"
label values encode2 control
gen drop=1 if encode2==1 | encode2==2 
gen drop2=1 if numtvaanonym==""
drop if drop==1
drop drop* 

* 1.7 : Save as temporary dataset
replace oblig=4 if ss==1
id_group buyer "numtvaanonym iso2"
drop num  flux iso2_tva ss missing_iso2 encode2 //payspart
save ./firm/panel_f2f_clean_19932017, replace


rename mois month
collapse (sum) export kgs unites, by(siren numtvaanonym iso2 buyer year month nc8 cpa6 regdem dept natr oblig indicmasse)

sort siren numtvaanonym iso2 buyer year month nc8
by siren numtvaanonym iso2 buyer: g _=1 if _n==1
replace _=0 if _!=1
by siren numtvaanonym iso2 buyer year month: egen first=max(_)
drop _
by siren numtvaanonym iso2 buyer: g _=1 if _n==_N
replace _=0 if _!=1
by siren numtvaanonym iso2 buyer year month: egen last=max(_)
drop _
g _=year if first==1
g __=month if first==1
by siren numtvaanonym iso2 buyer: egen ___=max(_)
by siren numtvaanonym iso2 buyer: egen ____=max(__)
tostring ___, replace
tostring ____, replace
g firstdate=___+"."+____ if length(____)==2
replace firstdate=___+".0"+____ if length(____)==1
drop _ __ ___ ____
g _=year if last==1
g __=month if last==1
by siren numtvaanonym iso2 buyer: egen ___=max(_)
by siren numtvaanonym iso2 buyer: egen ____=max(__)
tostring ___, replace
tostring ____, replace
g lastdate=___+"."+____ if length(____)==2
replace lastdate=___+".0"+____ if length(____)==1
drop _ __ ___ ____

save ./firm/panel_f2f_clean_19932017, replace


